Load the necessary libraries
library(car) #for regression diagnostics
library(broom) #for tidy output
library(ggfortify) #for model diagnostics
library(DHARMa) #for residual diagnostics
library(performance) #for residuals diagnostics
library(see) #for plotting residuals
library(sjPlot) #for outputs
library(knitr) #for kable
library(effects) #for partial effects plots
library(ggeffects) #for partial effects plots
library(emmeans) #for estimating marginal means
library(modelr) #for auxillary modelling functions
library(tidyverse) #for data wrangling
theme_set(theme_classic())
Here is an example from Fowler, Cohen, and Jarvis (1998). An agriculturalist was interested in the effects of fertilizer load on the yield of grass. Grass seed was sown uniformly over an area and different quantities of commercial fertilizer were applied to each of ten 1 m2 randomly located plots. Two months later the grass from each plot was harvested, dried and weighed. The data are in the file fertilizer.csv in the data folder.
| fert | yield |
|---|---|
| 25 | 84 |
| 50 | 80 |
| 75 | 90 |
| 100 | 154 |
| 125 | 148 |
| ... | ... |
| fert: Mass o | f fertilizer (g.m-2) - Predictor variable |
| yield: | Yield of grass (g.m-2) - Response variable |
The aim of the analysis is to investigate the relationship between fertilizer concentration and grass yield.
fert = read_csv('../data/fertilizer.csv', trim_ws=TRUE)
## Parsed with column specification:
## cols(
## FERTILIZER = col_double(),
## YIELD = col_double()
## )
fert <- rename(fert, yield = YIELD, fert = FERTILIZER)
glimpse(fert)
## Rows: 10
## Columns: 2
## $ fert <dbl> 25, 50, 75, 100, 125, 150, 175, 200, 225, 250
## $ yield <dbl> 84, 80, 90, 154, 148, 169, 206, 244, 212, 248
## Explore the first 6 rows of the data
head(fert)
str(fert)
## tibble [10 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ fert : num [1:10] 25 50 75 100 125 150 175 200 225 250
## $ yield: num [1:10] 84 80 90 154 148 169 206 244 212 248
## - attr(*, "spec")=
## .. cols(
## .. FERTILIZER = col_double(),
## .. YIELD = col_double()
## .. )
ggplot(fert, aes(x = fert, y = yield)) +
geom_smooth(method = 'lm') +
geom_smooth(se=F, col="red") +
geom_point()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Model formula: \[ y_i \sim{} \mathcal{N}(\mu_i, \sigma^2)\\ \mu_i = \beta_0 + \beta_1 x_i \]
where \(y_i\) represents the \(i\) observed values, \(\beta_0\) and \(\beta_1\) represent the intercept and slope respectively, and \(\sigma^2\) represents the estimated variance.
ggplot(fert, aes(y=yield, x=fert)) +
geom_point() +
geom_smooth()
ggplot(fert, aes(y=yield, x=fert)) +
geom_point() +
geom_smooth(method='lm')
ggplot(fert, aes(y=yield)) +
geom_boxplot(aes(x=1))
fert.lm<-lm(yield~1+fert, data=fert)
fert.lm<-lm(yield~fert, data=fert)
attributes(fert.lm)
## $names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
##
## $class
## [1] "lm"
names(fert.lm)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
Check if our assumptions really are adequate!
autoplot(fert.lm, which = 1:6, ncol = 2, label.size = 3)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Get the Cook’s d values:
influence.measures(fert.lm)
## Influence measures of
## lm(formula = yield ~ fert, data = fert) :
##
## dfb.1_ dfb.fert dffit cov.r cook.d hat inf
## 1 5.39e-01 -0.4561 0.5411 1.713 0.15503 0.345
## 2 -4.15e-01 0.3277 -0.4239 1.497 0.09527 0.248
## 3 -6.01e-01 0.4237 -0.6454 0.969 0.18608 0.176
## 4 3.80e-01 -0.2145 0.4634 1.022 0.10137 0.127
## 5 -5.77e-02 0.0163 -0.0949 1.424 0.00509 0.103
## 6 -2.50e-02 -0.0141 -0.0821 1.432 0.00382 0.103
## 7 3.61e-17 0.1159 0.2503 1.329 0.03374 0.127
## 8 -2.19e-01 0.6184 0.9419 0.623 0.31796 0.176
## 9 3.29e-01 -0.6486 -0.8390 1.022 0.30844 0.248
## 10 1.51e-01 -0.2559 -0.3035 1.900 0.05137 0.345 *
Leverage is in the hat column
Using performance package:
require(performance)
performance::check_model(fert.lm, check = c("normality", "homogeneity", "ncv"))
## Not enough model terms in the conditional part of the model to check for multicollinearity.
performance::check_outliers(fert.lm)
## OK: No outliers detected.
# Checks for cook's d values and leverage
Using DHARMa package:
require(DHARMa)
fert.resid <- simulateResiduals(fert.lm, plot = T)
Left-hand side: QQ plot * K-S is a test for the distribution fit being significantly different from Gaussian, but is very strict, the test is more robust than this assumption * Don’t have to worry about dispersion for Gaussian * Outlier test determined by Cook’s distance and leverage, but better determined by plotting Cook’s distances
Right-hand side: residual plot
testZeroInflation(fert.resid)
##
## DHARMa zero-inflation test via comparison to expected zeros with
## simulation under H0 = fitted model
##
## data: simulationOutput
## ratioObsSim = NaN, p-value = 1
## alternative hypothesis: two.sided
testResiduals(fert.resid)
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.224, p-value = 0.6209
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 0.97718, p-value = 0.952
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 0, observations = 10, p-value = 1
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.0 0.1
## sample estimates:
## outlier frequency (expected: 0.01 )
## 0
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.224, p-value = 0.6209
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 0.97718, p-value = 0.952
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 0, observations = 10, p-value = 1
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.0 0.1
## sample estimates:
## outlier frequency (expected: 0.01 )
## 0
Can plot partial plots:
fert.lm %>% plot_model(type = "eff", show.data = T)
## $fert
plot(effects::allEffects(fert.lm, residuals = T))
Looks good!
Using package ggemmeans:
fert.lm %>% ggemmeans(~fert) %>% plot(add.data = T)
Confidence intervals:
confint(fert.lm)
## 2.5 % 97.5 %
## (Intercept) 22.0036246 81.863042
## fert 0.6184496 1.004338
tidy(fert.lm, conf.int = T)
newdata = data.frame(fert = 110)
predict(fert.lm, newdata = newdata)
## 1
## 141.1867
Using package emmeans: (newdata to be a list)
newdata = list(fert = 110)
emmeans(fert.lm, ~fert, at = newdata)
## fert emmean SE df lower.CL upper.CL
## 110 141 6.43 8 126 156
##
## Confidence level used: 0.95
## Input is:
# model
# predictor we're interested in
# at the things held constant we are interested in
# can use either of these:
fert_grid <- fert %>% data_grid(fert = seq_range(fert, n=100))
fert_grid <- expand_grid(fert = seq_range(fert$fert, n=100))
pred_yield <- emmeans(fert.lm, ~ fert, at = fert_grid) %>% as.data.frame %>%
mutate(yield = emmean)
fert %>%
ggplot(aes(x = fert, y = yield)) +
geom_ribbon(data = pred_yield, aes(ymin = lower.CL, ymax = upper.CL), fill = "green", alpha = 0.3) +
geom_point() +
geom_line(data = pred_yield) +
labs(x = expression(Fertilizer~concentration~(g.ml^-1)),
y = expression(Gradd~yield~(g.m^-3)))
Get pair-wise difference calculations:
newdata <- list(fert = c(100,200))
emmeans(fert.lm, ~fert, at = newdata) %>% pairs() %>% confint()
Centre the data (but not scale it):
fert.lm1 <- lm(yield ~ scale(fert, scale=F), data=fert)
summary(fert.lm1)
##
## Call:
## lm(formula = yield ~ scale(fert, scale = F), data = fert)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.79 -11.07 -5.00 12.00 29.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 163.50000 6.00813 27.213 3.58e-09 ***
## scale(fert, scale = F) 0.81139 0.08367 9.697 1.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19 on 8 degrees of freedom
## Multiple R-squared: 0.9216, Adjusted R-squared: 0.9118
## F-statistic: 94.04 on 1 and 8 DF, p-value: 1.067e-05